Load necessary packages
library(jsonlite)
library(dplyr)
library(ggplot2)
library(magrittr)
library(ggmap)
library(knitr)
Load training data
# Toggle between the Kaggle kernel layout and a local checkout.
KAGGLE <- TRUE
# KAGGLE is a single flag, so a plain if/else is used instead of the
# vectorised ifelse().
train_path <- if (KAGGLE) "../input/train.json" else "data/train.json"
# Fail early with a clear message: given a non-existent path, fromJSON()
# would otherwise try to interpret the string itself as JSON.
if (!file.exists(train_path)) {
  stop("Training data not found at: ", train_path, call. = FALSE)
}
training <- fromJSON(train_path) %>% bind_rows

# Set the list-valued variables aside so they are not flattened below.
features <- training$features
photos <- training$photos
training$features <- NULL
training$photos <- NULL

# Flatten every remaining variable and rebuild a plain data.frame.
# NOTE(review): sapply() presumably simplifies to one matrix here, so all
# columns would share a single type until they are re-cast -- confirm.
training <- sapply(training, unlist) %>%
  data.frame(., stringsAsFactors = FALSE)

# Re-attach the list-valued variables.
training$features <- features
training$photos <- photos

# Drop the temporary copies to free memory.
rm(features, photos)
Numerical and factor variables
# Columns that hold numbers and must be converted to numeric.
numerical_variables <- c(
  "bathrooms", "bedrooms", "longitude", "latitude", "price"
)
# Convert each of those columns to numeric, writing the result back in place.
training[numerical_variables] <- lapply(
  training[numerical_variables],
  as.numeric
)
# The interest level is categorical, so store it as a factor.
training$interest_level <- as.factor(training$interest_level)
Density plot of distance
# Cutoff on distance_city used to discard real estate far from the
# city center.
ny_outliners_dist <- 0.2

# Density of the distance to the city center per interest level, restricted
# to listings closer than the cutoff.
ggplot(
  data = training[training$distance_city < ny_outliners_dist, ],
  mapping = aes(x = distance_city, colour = interest_level)
) +
  geom_density()

# Same density on a log scale, this time over every listing (no cutoff).
ggplot(
  data = training,
  mapping = aes(x = log(distance_city), colour = interest_level)
) +
  geom_density()

NYC satellite map
# Download a satellite basemap centred on the alternate New York City
# center coordinates.
map <- get_googlemap(
  center = as.numeric(ny_center),
  zoom = 12,
  maptype = "satellite",
  sensor = FALSE
)
# Plot every listing on top of the basemap, one panel per interest level,
# coloured by interest level and without axis titles.
p <- ggmap(map) +
  geom_point(
    data = training,
    mapping = aes(x = longitude, y = latitude, colour = interest_level),
    size = 1
  ) +
  facet_grid(facets = . ~ interest_level, scales = "free", space = "free") +
  labs(x = "", y = "") +
  scale_colour_brewer(palette = "Set1")
p

Outliers: find real coordinates
# Street addresses of the listings whose longitude or latitude is exactly 0.
outliers_addrs <- training$street_address[
  training$longitude == 0 | training$latitude == 0
]
outliers_addrs
## [1] "145 28 Street" "Van Sicklen street" "219 E 28th"
## [4] "1632 Madison Ave" "41-42 24th St " "450 East 83rd Street"
## [7] "247 west 87" "118 W 109th" "246 Mott St "
## [10] "21 W 106th" "338 e. 53" "259 Decatur Street"
# Addresses are supposed to be in NYC, so the city is appended to each query.
# trimws() + paste0() avoid the stray " ," that paste()'s default separator
# inserted (e.g. "246 Mott St  , new york").
outliers_ny <- paste0(trimws(outliers_addrs), ", new york")
# Keep the original addresses as the first column of the result table.
outliers_addrs <- data.frame(
  street_address = outliers_addrs,
  stringsAsFactors = FALSE
)
# Search for the geographic location of each address with Google.
# geocode() accepts a character vector, so a single call returns one row per
# address with lon/lat columns; this replaces the sapply()/t() round trip,
# which produced list-columns.
coords <- cbind(outliers_addrs, geocode(outliers_ny, source = "google"))
# Use plain sequential row names.
rownames(coords) <- NULL
# Display table
kable(coords)
| street_address | lon | lat |
|:---|---:|---:|
| 145 28 Street | -73.99244 | 40.74708 |
| Van Sicklen street | -73.97504 | 40.59679 |
| 219 E 28th | -73.97982 | 40.74179 |
| 1632 Madison Ave | -73.94847 | 40.79576 |
| 41-42 24th St | -73.94131 | 40.75153 |
| 450 East 83rd Street | -73.94899 | 40.77399 |
| 247 west 87 | -73.97555 | 40.78888 |
| 118 W 109th | -73.96273 | 40.8015 |
| 246 Mott St | -73.99466 | 40.72328 |
| 21 W 106th | -73.96095 | 40.79874 |
| 338 e. 53 | -73.96576 | 40.75591 |
| 259 Decatur Street | -73.93344 | 40.68165 |
Update dataset
# Locate the outlier rows with the same condition that was used to collect
# the addresses that were geocoded into `coords` (longitude OR latitude
# equal to 0). Using two separate masks, one per coordinate, would select a
# different number of rows whenever only one of the two values is 0.
outlier_rows <- which(training$longitude == 0 | training$latitude == 0)
# `coords` must hold exactly one geocoded row per outlier, in row order.
stopifnot(length(outlier_rows) == nrow(coords))
# Overwrite both coordinates of every outlier. unlist() + as.numeric() accept
# lon/lat stored either as plain numeric columns or as list-columns.
training$longitude[outlier_rows] <- as.numeric(unlist(coords$lon))
training$latitude[outlier_rows] <- as.numeric(unlist(coords$lat))